Writing has always been my nightmare, and since childhood I knew I could avoid it by showing an interest in numbers. Ten years ago, however, I would never have believed that one day I would still need to write something for a statistics class.
I did not have any clue what to write about, but I was not stuck for long. Soon I started to wonder about the possibility of building a writing generator, and how accurate it could be.

To get a rough idea of the feasibility of building a writing generator, I decided to test with a lyrics dataset and see how far text mining could go.

Data

The first part is to load the data, get the data into tidy format and implement some initial exploration.

# Load the raw lyrics data (provides `dt_lyrics`).
load('lyrics.RData')
# Filler/contraction tokens that carry no meaning but survive stop-word removal.
unwanted <- c("lot", "wanna", "wouldnt", "wasnt", "ha", "na", "ooh", "da", "gonna", "im", "dont", "aint", "wont", "yeah", "la", "oi", "hey")

# Tokenize lyrics into one word per row, drop short / unwanted / stop words,
# then reduce each word to its stem.
tidydata <- dt_lyrics %>%
  unnest_tokens(word, lyrics) %>%
  filter(nchar(word) >= 3) %>%   # keep words of at least 3 characters
  filter(!word %in% unwanted) %>%
  anti_join(stop_words) %>%
  mutate(word = wordStem(word))
# Export the processed data; run this only once.
# BUG FIX: previously this saved the raw `dt_lyrics` instead of the
# processed `tidydata`, contradicting the "export processed data" intent.
save(tidydata, file = "processed_lyrics.RData")
# prepare for visualization
# Tag each row with its release decade; years outside 1970-2016 become "other".
# case_when() replaces the hard-to-read nested ifelse() chain, and `year` is
# referenced directly instead of `tidydata$year` — using `$` inside mutate()
# bypasses the pipeline's data masking and breaks under grouping/filtering.
vizdata <- tidydata %>%
  mutate(decade = case_when(
    year %in% 1970:1979 ~ "1970s",
    year %in% 1980:1989 ~ "1980s",
    year %in% 1990:1999 ~ "1990s",
    year %in% 2000:2009 ~ "2000s",
    year %in% 2010:2016 ~ "2010s",
    TRUE ~ "other"
  ))

Initial exploration

Radar Chart

# Songs per decade and genre, reshaped wide for fmsb::radarchart().
# pivot_wider() replaces the superseded spread(); names_sort = TRUE keeps
# the alphabetical column order spread() produced (positions 2:13 below
# depend on it).
vizdata1 <- vizdata %>%
  dplyr::filter(decade != "other") %>%
  dplyr::select(song, decade, genre) %>%
  group_by(decade, genre) %>%
  summarize(song_count = n()) %>%
  pivot_wider(names_from = genre, values_from = song_count, names_sort = TRUE)

# radarchart() expects a plain data frame whose first two rows are the axis
# maxima and minima, followed by one row per series (decade).
# NOTE(review): row 1 is 238 and row 2 is 3079560, i.e. "max" < "min",
# which inverts the axes — confirm this is intentional.
data <- as.data.frame(vizdata1[1:5, 2:13])
rownames(data) <- c("1970s", "1980s", "1990s", "2000s", "2010s")
data <- rbind(rep(238, 12), rep(3079560, 12), data)

# Semi-transparent fill colours, one per decade (unused colors_border removed).
coul <- brewer.pal(5, "Set2")
colors_in <- alpha(coul, 0.5)

radarchart(data, na.itp = TRUE, axistype = 1,   # TRUE, not the reassignable T
           pcol = colors_in, plwd = 8, plty = 1,
           cglcol = "grey", cglty = 1, axislabcol = "grey",
           caxislabels = seq(238, 3079602, 769841), cglwd = 0.8,
           vlcex = 2)
legend(x = 1, y = 1.2, legend = rownames(data[-c(1, 2), ]), bty = "n", pch = 20,
       col = colors_in, cex = 1.2, pt.cex = 2)

Lexical Diversity

# Distinct-word count per song, grouped by decade, for the lexical
# diversity plot below.
vizdata2 <- vizdata %>%
  filter(decade != "other") %>%
  dplyr::group_by(decade, song) %>%
  mutate(word_count = n_distinct(word)) %>%
  dplyr::select(song, decade, word_count) %>%
  distinct() %>%
  ungroup()

# One "bean" per decade: jittered raw points plus an average line.
pirateplot(
  formula = word_count ~ decade,
  data = vizdata2,
  xlab = NULL,
  ylab = "Song Distinct Word Count",
  main = "Lexical Diversity Per Decade",
  point.pch = 16,
  point.cex = 1.5,
  jitter.val = .12,
  pal = "southpark",
  point.o = .3,
  avg.line.o = 1,
  avg.line.lwd = 2,
  theme = 0,
  cex.lab = 2,
  cex.names = 2
)

Data Table

# Overall word-frequency table (count() already returns ungrouped data,
# so no ungroup() step is needed), rendered as an interactive DT table.
f1 <- vizdata %>%
  count(word, sort = TRUE) %>%
  mutate(word = reorder(word, n))

datatable(f1)

Eternal words in lyrics

# Ten most frequent words within each decade; `row` gives every bar a
# unique x position so the per-facet axis labels can be remapped below.
alltime_words <- vizdata %>%
  filter(decade != "other") %>%
  group_by(decade) %>%
  count(word, decade, sort = TRUE) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  arrange(decade, n) %>%
  mutate(row = row_number())

alltime_words %>%
  ggplot(aes(x = row, y = n, fill = decade)) +
  geom_col(show.legend = NULL) +
  facet_wrap(decade ~ ., scales = "free") +
  scale_x_continuous(
    breaks = alltime_words$row,
    labels = alltime_words$word
  ) +
  coord_flip() +
  labs(x = NULL, y = "Word Count", title = "All time words")

# Overall word frequencies (decade-tagged rows only) for the word cloud.
word_count <- vizdata %>%
  filter(decade != "other") %>%
  count(word, sort = TRUE)
set.seed(2121)
# BUG FIX: "American Typewritter" was misspelled, so browsers silently fell
# back to a default font; the correct family name is "American Typewriter".
wordcloud2(word_count[1:500, ], size = 1, minSize = .001, ellipticity = .3,
           rotateRatio = 1, fontFamily = "American Typewriter", fontWeight = "normal",
           color = "random-dark", backgroundColor = "#D7D0C3")

Sentiment Analysis

NRC

# Word frequencies joined with the NRC emotion lexicon.
nrc_counts <- vizdata %>%
  inner_join(get_sentiments("nrc")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Ten most frequent words within each NRC emotion category.
vizdata4 <- nrc_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))

vizdata4 %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ ., scales = "free") +
  labs(y = "word count") +
  coord_flip()

bing

# Word frequencies joined with the bing positive/negative lexicon.
bing_counts <- vizdata %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Ten most frequent words per polarity.
vizdata5 <- bing_counts %>%
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))

vizdata5 %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(sentiment ~ ., scales = "free") +
  coord_flip() +
  labs(y = "Contribution to sentiment", x = NULL)

wordcloud

# Positive and negative word clouds based on bing sentiment.
vizdata6 <- vizdata %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup() %>%
  mutate(word = reorder(word, n))
set.seed(2121)
# BUG FIX: the original subset only `word` (word[sentiment == ...]) but
# passed the FULL `n` vector, so word and frequency vectors had different
# lengths and frequencies were misaligned. Subset the whole table instead.
# Also TRUE, not the reassignable shorthand T.
with(subset(vizdata6, sentiment == "positive"),
     wordcloud(word, n, max.words = 200,
               random.order = TRUE, random.color = TRUE))

with(subset(vizdata6, sentiment == "negative"),
     wordcloud(word, n, max.words = 200,
               random.order = TRUE, random.color = TRUE))

TF-IDF

# Per-song TF-IDF: which words characterize individual songs rather than
# the corpus as a whole.
desc_tf_idf <- vizdata %>%
  filter(decade != "other") %>%
  count(song, word, sort = TRUE) %>%
  ungroup() %>%
  bind_tf_idf(word, song, n) %>%
  arrange(desc(tf_idf))
desc_tf_idf
## # A tibble: 5,045,785 x 6
##    song              word           n    tf   idf tf_idf
##    <chr>             <chr>      <int> <dbl> <dbl>  <dbl>
##  1 pollard           pollard       26 0.743 11.5    8.51
##  2 chee-chee-oo-chee chee         153 0.773 10.1    7.78
##  3 batman-theme      batman        32 1      7.62   7.62
##  4 let-s-rab         rab           29 0.829  8.97   7.43
##  5 loddy-lo          lardi         56 0.667 10.8    7.17
##  6 f10-d-a           ef            19 0.704  9.51   6.69
##  7 i-d-s-t           i.d.s.t       16 0.571 11.5    6.54
##  8 telecommunication telecommun    24 0.522 11.5    5.98
##  9 this-is-industry  industri      25 1      5.53   5.53
## 10 steam-machine     steam         59 1      5.47   5.47
## # … with 5,045,775 more rows
# Song-to-genre lookup for the decade-tagged data.
d4 <- vizdata %>%
  filter(decade != "other") %>%
  select(song, genre) %>%
  distinct()

# Highest tf-idf words per genre. Rows with tf == 1 are dropped: a term
# filling an entire (usually one-word) song is noise, not signal.
desc_tf_idf %>%
  left_join(d4, by = "song") %>%
  filter(!near(tf, 1)) %>%
  arrange(desc(tf_idf)) %>%
  group_by(genre) %>%
  distinct(word, genre, .keep_all = TRUE) %>%
  top_n(10, tf_idf) %>%
  ungroup() %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>%
  ggplot(aes(x = word, y = tf_idf, fill = genre)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~genre, scales = "free") +
  coord_flip() +
  labs(title = "Highest tf-idf words in songs",
       caption = "songs",
       x = NULL,
       y = "tf-idf")

Topic Modeling

# Slang/profanity stems to exclude before topic modelling.
unwanted_words <- c("pussi", "i'mma", "imma", "i'ma", "swag", "gangsta",
                    "nigga", "niggaz", "motherfuckin")

# Per-song word counts that will feed the document-term matrix.
word_counts <- vizdata %>%
  filter(decade != "other") %>%
  filter(!word %in% unwanted_words) %>%
  count(song, word, sort = TRUE) %>%
  ungroup()

head(word_counts)
## # A tibble: 6 x 3
##   song         word      n
##   <chr>        <chr> <int>
## 1 i-love-you   love    529
## 2 time         time    502
## 3 stay         stai    489
## 4 jingle-bells jingl   455
## 5 shine        shine   452
## 6 fever        fever   416
# Cast the tidy per-song word counts into a DocumentTermMatrix
# (one row per song), the input format required by topicmodels::LDA().
desc_dtm <- word_counts %>%
  cast_dtm(song, word, n)

desc_dtm
## <<DocumentTermMatrix (documents: 94150, terms: 90364)>>
## Non-/sparse entries: 5036430/8502734170
## Sparsity           : 100%
## Maximal term length: 247
## Weighting          : term frequency (tf)
# Fit a 6-topic LDA model; the fixed seed makes the stochastic VEM fit
# reproducible. Time consuming on ~94k documents.
desc_lda <- LDA(desc_dtm, k = 6, control = list(seed = 1234))
desc_lda
## A LDA_VEM topic model with 6 topics.

Here I try to summarize 6 topics out of the lyrics.

Beta

# Per-topic word probabilities (the beta matrix) from the fitted LDA model.
tidy_lda <- tidy(desc_lda)

# Ten highest-probability terms within each topic, ordered for display.
top_terms <- tidy_lda %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

head(top_terms, 10)
## # A tibble: 10 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 love    0.132  
##  2     1 babi    0.0536 
##  3     1 girl    0.0279 
##  4     1 feel    0.0254 
##  5     1 heart   0.0193 
##  6     1 night   0.0124 
##  7     1 time    0.0121 
##  8     1 hold    0.0115 
##  9     1 tonight 0.0112 
## 10     1 life    0.00897
# Plot the top terms; reorder_within()/scale_x_reordered() keep the bar
# ordering correct independently within each facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  group_by(topic, term) %>%
  arrange(desc(beta)) %>%
  ungroup() %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~ topic, ncol = 3, scales = "free") +
  labs(title = "Top 10 terms in each LDA topic",
       x = NULL,
       y = expression(beta))

gamma

# Per-document topic probabilities (the gamma matrix): one row per
# (song, topic) pair.
lda_gamma <- tidy(desc_lda, matrix = "gamma")
lda_gamma
## # A tibble: 564,900 x 3
##    document        topic     gamma
##    <chr>           <int>     <dbl>
##  1 i-love-you          1 0.570    
##  2 time                1 0.0504   
##  3 stay                1 0.350    
##  4 jingle-bells        1 0.0000683
##  5 shine               1 0.1000   
##  6 fever               1 0.716    
##  7 home                1 0.116    
##  8 hold-on             1 0.371    
##  9 bang-bang           1 0.0704   
## 10 time-after-time     1 0.0878   
## # … with 564,890 more rows
# Histogram of gamma per topic; the log y-axis handles the mass of
# documents near 0 or 1.
ggplot(lda_gamma, aes(x = gamma, fill = as.factor(topic))) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~ topic, ncol = 3) +
  scale_y_log10() +
  labs(title = "Distribution of probability for each topic",
       y = "Number of documents",
       x = expression(gamma))

# Song-to-genre lookup matching the topic-model corpus (same filters as
# word_counts). A song may carry several genres, so the join fans out.
d3 <- vizdata %>%
  filter(decade != "other") %>%
  filter(!word %in% unwanted_words) %>%
  select(song, genre) %>%
  distinct()

lda_gamma <- lda_gamma %>%
  left_join(d3, by = c("document" = "song"))
lda_gamma
## # A tibble: 645,738 x 4
##    document   topic  gamma genre        
##    <chr>      <int>  <dbl> <chr>        
##  1 i-love-you     1 0.570  Rock         
##  2 i-love-you     1 0.570  Pop          
##  3 i-love-you     1 0.570  Hip-Hop      
##  4 i-love-you     1 0.570  Jazz         
##  5 i-love-you     1 0.570  Country      
##  6 i-love-you     1 0.570  Not Available
##  7 time           1 0.0504 Rock         
##  8 time           1 0.0504 Metal        
##  9 time           1 0.0504 Pop          
## 10 time           1 0.0504 Country      
## # … with 645,728 more rows
# Count songs strongly assigned to a topic (gamma > 0.9), by genre.
top_genre <- lda_gamma %>%
  filter(gamma > 0.9) %>%
  count(topic, genre, sort = TRUE)

head(top_genre, 10)
## # A tibble: 10 x 3
##    topic genre       n
##    <int> <chr>   <int>
##  1     5 Metal    1709
##  2     2 Rock      813
##  3     6 Hip-Hop   609
##  4     1 Rock      545
##  5     1 Pop       438
##  6     5 Rock      389
##  7     3 Rock      366
##  8     4 Rock      237
##  9     2 Pop       181
## 10     3 Pop       165
# Five most common genres among the songs dominated by each topic.
top_genre %>%
  group_by(topic) %>%
  top_n(5, n) %>%
  ungroup() %>%
  mutate(genre = reorder_within(genre, n, topic)) %>%
  ggplot(aes(x = genre, y = n, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  scale_x_reordered() +
  facet_wrap(~ topic, ncol = 3, scales = "free") +
  labs(title = "Top genre for each LDA topic",
       x = NULL,
       y = "Number of songs")

Comparison

# Ten most frequent words within each genre; `row` gives every bar a
# unique x position so per-facet axis labels can be remapped in the plot.
vizdata3 <- vizdata %>%
  group_by(genre) %>%
  count(word, genre, sort = TRUE) %>%
  slice(seq_len(10)) %>%
  ungroup() %>%
  arrange(genre, n) %>%
  mutate(row = row_number())

# Faceted bar chart of the top words per genre.
vizdata3 %>%
  ggplot(aes(x = row, y = n, fill = genre)) +
  geom_col(show.legend = NULL) +
  facet_wrap(~genre, scales = "free") +
  scale_x_continuous(
    breaks = vizdata3$row,
    labels = vizdata3$word
  ) +
  coord_flip() +
  labs(x = NULL, y = "Word Count", title = "Genre top words")

Comparison of Figure 13 with Figures 10 and 12:
From Figure 10 we could tell that topic 1 is more about love relationship, with words such as “love”, “babi”, “girl”, “heart”, etc. And Figure 12 indicates that plenty of songs in topic 1 are rock songs. Finally, when looking at Figure 13 and the top words for rock songs, indeed they are words like “love”, “babi”, “girl” and “heart”.

Having been through all the analysis above, I am fairly certain that a writing generator is plausible. The only problem is that I do not have any existing samples with which to train the generator to write an article in my own style — purely because I have not written anything yet. What a paradox!